#  coding:utf-8
# embedding + LR
from sklearn.linear_model import LogisticRegression
from gensim.models import KeyedVectors
import numpy as np
import pickle
from sklearn.metrics import precision_score,recall_score,f1_score

def gen_x_y(corpus_path, w2v_path, emb_dim=200):
    '''
    Build averaged-embedding features and labels from a labelled corpus.

    Each corpus line looks like: "1\tword1 word2 word3" (label, TAB,
    space-separated tokens), e.g. "1\t我 是 菜鸡". Every token is mapped
    to its word2vec vector (a random vector for OOV tokens) and the token
    vectors are averaged per dimension into one sentence vector.

    :param corpus_path: path to the labelled, pre-tokenized corpus file
    :param w2v_path: path to a word2vec-format embedding file
    :param emb_dim: embedding dimension, used for OOV fallback vectors
    :return: X  list of sentence vectors, shape (n_samples, emb_dim)
             y  list of one-element label lists, shape (n_samples, 1)
    '''
    w2v = KeyedVectors.load_word2vec_format(w2v_path)
    X, y = [], []
    with open(corpus_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines instead of crashing on them
            label, _, text = line.partition('\t')
            sentence = []
            for word in text.split(' '):
                try:
                    emb = w2v[word]
                except KeyError:
                    # OOV token: fall back to a random vector of the right size
                    emb = np.random.random(size=emb_dim)
                # BUG FIX: append inside the loop so EVERY word contributes
                # (the original only kept the last word's embedding)
                sentence.append(emb)
            # BUG FIX: average per dimension (axis=0) to get an emb_dim
            # vector; np.average without axis collapses to a scalar
            X.append(np.average(sentence, axis=0))
            y.append([label])
    return X, y


def train(x, y, output_path):
    '''
    Fit a logistic-regression classifier and persist it with pickle.

    :param x: feature matrix, shape (n_samples, emb_dim)
    :param y: labels, shape (n_samples,) or (n_samples, 1)
    :param output_path: file path the pickled model is written to
    '''
    model = LogisticRegression()
    # ravel flattens a (n_samples, 1) label column into the (n_samples,)
    # shape sklearn expects, avoiding a DataConversionWarning
    model.fit(x, np.ravel(y))
    # BUG FIX: use a context manager so the file handle is always closed
    # (the original open() handle was never closed)
    with open(output_path, 'wb') as f:
        pickle.dump(model, f)

def test(x, y, model_path):
    '''
    Load a pickled classifier and print precision, recall and F1 on (x, y).

    :param x: feature matrix, shape (n_samples, emb_dim)
    :param y: true labels, shape (n_samples,) or (n_samples, 1)
    :param model_path: path to the pickled model produced by train()
    '''
    # NOTE(review): pickle.load is unsafe on untrusted files — only load
    # models produced by train() above.
    # BUG FIX: context manager closes the file handle deterministically
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    y_pred = model.predict(x)  # predicted labels, shape (n_samples,)
    y_true = np.ravel(y)  # flatten a possible (n_samples, 1) column
    p = precision_score(y_true, y_pred)
    r = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(p, r, f1)

if __name__ == '__main__':

    # Script entry point — intentionally left unimplemented; wire up
    # gen_x_y / train / test with real corpus and model paths here.
    pass






